Games of Daze

home *** CD-ROM | disk | FTP | other *** search

/ Games of Daze / Infomagic - Games of Daze (Summer 1995) (Disc 1 of 2).iso / x2ftp / msdos / ai / fuzzy / token.b < prev next >

Wrap

Text File | 1986-11-29 | 31KB | 800 lines

------------------------------------------------------------------------------- -- -- -- Library Unit: Token -- Get token package -- -- -- -- Author: Bradley L. Richards -- -- -- -- Version Date Notes . . . -- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -- -- 1.0 6 Feb 86 Initial Version -- -- 1.1 25 Feb 86 All basic routines completed -- -- 1.2 13 Mar 86 Added reserved words. Split Ada and Fuzzy -- -- Prolog via conditional compilation -- -- 1.3 22 May 86 Revised lots of Fuzzy Prolog stuff to make it -- -- work; adding reserved words, etc.. -- -- 1.4 19 Jun 86 Use revised io package and data_def -- -- 2.0 20 Jun 86 Token_type extracted into package Data_def -- -- 2.05 13 Jul 86 Split into separate spec and body files -- -- 2.1 21 Jul 86 Demonstration Version -- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -- -- -- -- Library units used: io -- read source file and produce listing -- -- listing -- insert messages in listing -- -- data_def -- common data definitions -- -- -- -- Description: This package reads the source file (via "io") and parses -- -- out individual tokens. There are three defined types which calling -- -- routines must use. Token_type defines the legal kinds of tokens -- -- which may be returned. Token_record is a variant record (with a -- -- discriminant of token_type) which is used to hold tokens. Finally, -- -- token_ptr is the access type to the dynamically allocated -- -- token_records. -- -- This package must be initialized by calling start_token. Then -- -- subsequent calls to get_token will return one token each. Note -- -- that end-of-file is indicated by the special token "end_of_file." -- -- -- -- Note that this package is designed to be used with multiple -- -- languages. In order to share the code and ensure that generic -- -- changes are made to all versions, this package is kept in a master -- -- source file which contains conditional compilation directives -- -- for use by the Ada preprocessor "pp." -- -- -- ------------------------------------------------------------------------------- -- -- -- Package Body -- -- -- ------------------------------------------------------------------------------- package body token is seen_end_of_file : boolean; procedure get_token is separate; -------------------------------------------- -- Initialization and Utility functions -- -- in alphabetical order -- -------------------------------------------- -- -- start_token -- This routine makes the first call to get_char, so that -- get_token will have something to look at. -- procedure start_token(source_file, listing_file : in string) is begin seen_end_of_file := false; -- -- for Fuzzy Prolog use a lookahead of 1. A lookahead of 2 is too -- awkward to interactive i/o -- start_io(source_file, listing_file, 1); start_listing; get_char; -- get first character end start_token; -- -- stop_token -- This routine cleans up whatever needs to be done at the -- end of parsing a file -- procedure stop_token is begin stop_io; end stop_token; -- -- skip_rest_of_token -- When a token_fetching routine encounters an error -- it generally returns the last valid value it had, -- and then wants to skip over the rest of the -- erroneous token. This routine implements this by -- skipping characters until it encounters one which -- should not be embedded in a token. -- Note that it will stop on many characters which -- are not legal delimiters in the language. This -- allows these characters to be flagged as seperate -- errors. -- procedure skip_rest_of_token is begin while not valid_ending(look_ahead_char) loop get_char; end loop; end skip_rest_of_token; -- -- valid_ending -- This routine looks for characters which signal the end -- of a token. These characters may or may not technically -- be delimiters. -- -- Characters accepted by this routine are: -- -- space ! $ % & ( ) * + , - / : ; < = > ? @ [ \ ] ^ { | } ~ eot cr tab -- -- Characters not accepted: control characters (except eot, tab, & cr) -- digits, letters -- " # ' . _ ` ascii.rub -- function valid_ending( char : character ) return boolean is begin if (char in ' '..'!') or (char in '$'..'&') or (char in '('..'-') or (char = '/') or (char in ':'..'@') or (char in '['..'^') or (char in '{'..'~') or (char = ascii.ht) or (char = ascii.eot) or (char = ascii.cr) then return true; else return false; end if; end valid_ending; end token; -- -- get_token -- This routine parses and returns the next token in the source -- file. It expects look_ahead_char (the package variable) to be set -- to the next character to be processed. The current character, -- therefore, has already been parsed. All subroutines which -- get_token calls to handle the various token types must follow -- this convention. The subroutine which finally identifies the -- token is allocates and defines the token_record which is to -- be returned. The only time get_token itself defines the -- token_record is for the end_of_file. -- separate(token) procedure get_token is token : token_ptr; have_token : boolean := false; -- -- All token handling routines are separate. This makes the code -- easier to read than if several hundred lines were embedded here. -- procedure get_character( token : out token_ptr) is separate; procedure get_comment_or_minus( token : out token_ptr) is separate; procedure get_greater_than( token : out token_ptr) is separate; procedure get_identifier( token : out token_ptr) is separate; procedure get_number( token : out token_ptr) is separate; procedure get_string( token : out token_ptr) is separate; procedure get_fuzzy_backslash( token : out token_ptr) is separate; procedure get_fuzzy_colon( token : out token_ptr) is separate; procedure get_fuzzy_equal( token : out token_ptr) is separate; procedure get_fuzzy_underline( token : out token_ptr) is separate; begin -- get_token loop case look_ahead_char is -- -- skip embedded spaces -- when ' ' => loop get_char; exit when look_ahead_char /= ' '; end loop; -- -- handle multi-character tokens -- when 'A'..'Z' => get_identifier(token); have_token := true; when 'a'..'z' => get_identifier(token); have_token := true; when '0'..'9' => get_number(token); have_token := true; when '-' | '{' => get_comment_or_minus(token); if token.is_a /= null_token then have_token := true; end if; when '"' => get_string(token); have_token := true; when ''' => get_character(token); have_token := true; when '>' => get_greater_than(token); have_token := true; -- -- special cases: tab, end-of-line, and end-of-file -- when ascii.ht => get_char; when ascii.cr => get_char; when ascii.eot => token := new token_record'(is_a => end_of_file); have_token := true; if seen_end_of_file = true then raise unexpected_end_of_file; else seen_end_of_file := true; end if; when '=' => get_fuzzy_equal(token); if token.is_a /= null_token then have_token := true; end if; when ':' => get_fuzzy_colon(token); if token.is_a /= null_token then have_token := true; end if; -- -- now a sub-case statement to handle single character tokens -- when others => get_char; have_token := true; -- almost certainly true case current_char is when '(' => token := new token_record'(is_a => left_paren); when ')' => token := new token_record'(is_a => right_paren); when '*' => token := new token_record'(is_a => asterisk); when '+' => token := new token_record'(is_a => plus); when ',' => token := new token_record'(is_a => comma); when ';' => token := new token_record'(is_a => semicolon); when '|' => token := new token_record'(is_a => bar); when '!' => token := new token_record'(is_a => cut); when '/' => token := new token_record'(is_a => slash); when '<' => token := new token_record'(is_a => less_than); when '[' => token := new token_record'(is_a => left_bracket); when ']' => token := new token_record'(is_a => right_bracket); when '^' => token := new token_record'(is_a => hat); when '.' => token := new token_record'(is_a => period); when '\' => get_fuzzy_backslash(token); if token.is_a /= null_token then have_token := true; end if; when '_' => get_fuzzy_underline(token); if token.is_a /= null_token then have_token := true; end if; when others => error(pointer,"illegal character"); skip_rest_of_token; have_token := false; end case; end case; exit when have_token; end loop; current_token := token; end get_token; ------------------------------------------------------------------------------- -- -- -- Token Fetching Routines -- -- -- ------------------------------------------------------------------------------- -- -- get_character -- This subroutine expects to see a single character enclosed -- in single quotes. Since this syntax is strictly defined, -- there is no confusion when the character is a single -- -- Syntax: character_spec ::= ' ascii.character ' -- -- Examples: 'a' 'M' '#' ''' 'z' -- separate(token.get_token) procedure get_character( token : out token_ptr) is begin get_char; if (look_ahead_char = ascii.eot) or (look_ahead_char = ascii.cr) then error(pointer,"invalid character literal"); else get_char; token := new token_record'(character_lit, current_char); if look_ahead_char /= ''' then error(pointer,"invalid character literal"); skip_rest_of_token; else get_char; end if; end if; end get_character; -- -- get_comment_or_minus -- This routine handles two forms of comments. The -- first is initiated by two adjacent dashes and -- terminated by the end-of-line. If only a single -- dash is found, a "minus" token is returned. The -- second form of comment is enclosed within scroll -- brackets, and may cover multiple lines. Nesting -- level of the brackets is tracked, so comments may -- be nested. Comments return a "null_token." -- -- Syntax: minus ::= '-' -- comment ::= '--' comment 'ascii.cr' | '{' comment '}' -- separate(token.get_token) procedure get_comment_or_minus( token : out token_ptr) is nesting_level : natural := 1; begin get_char; if current_char = '-' then if look_ahead_char /= '-' then token := new token_record'(is_a => minus); else loop get_char; exit when (current_char = ascii.cr) or (current_char = ascii.eot); end loop; token := new token_record'(is_a => null_token); end if; else -- current_char = '{' loop get_char; if current_char = '}' then nesting_level := nesting_level - 1; elsif current_char = '{' then nesting_level := nesting_level + 1; elsif current_char = ascii.eot then error(pointer, "unterminated comment block"); nesting_level := 0; end if; exit when (nesting_level = 0); end loop; token := new token_record'(is_a => null_token); end if; end get_comment_or_minus; -- -- get_greater_than -- The two tokens beginning with '>' are the -- "greater_than" and the "greater_or_equal" -- separate(token.get_token) procedure get_greater_than( token : out token_ptr) is begin get_char; if look_ahead_char = '=' then get_char; token := new token_record'(is_a => greater_or_equal); else token := new token_record'(is_a => greater_than); end if; end get_greater_than; -- -- get_identifier -- Identifiers must begin with a letter (either upper or -- lower case), and may then contain both letters and -- digits. Underlines may be embedded, but must separate -- letters and digits. Case is significant, only for -- Fuzzy Prolog, and only in that the first character, if -- capitalized, indicates that the identifier is a variable. -- Underlines are significant in all identifiers. The only -- limit on identifier length is line length, which is -- controlled by package "special.io" -- -- Syntax: identifier ::= letter { [ '_' ] letter_or_digit } -- separate(token.get_token) procedure get_identifier( token : out token_ptr) is ptr : integer range 0..io.max_line_length := 0; ident_name : string(1..io.max_line_length) := (others => ' '); ident : name_ptr; err_flg : boolean := false; reserved : boolean := false; convert : constant integer := character'pos('a') - character'pos('A'); var_flg : boolean := false; -- -- This routine checks the identifier against the list of reserved -- words. If it is reserved, then "token" is set appropriately and -- reserved is true. The search method used is a simplistic hash table. -- procedure check_reserved(length : in integer; ident : in string; reserved : out boolean; token : out token_ptr) is type word_record is record word : string(1..9); rw_token : token_type; end record; char_pos : constant array ('A'..'Z',1..2) of integer := ( (1,4), (1,0), (5,7), (8,9), (1,0), (10,13), (14,15), (1,0), (16,17), (1,0), (1,0), (18,20), (21,21), (22,29), (30,31), (32,33), (1,0), (34,37), (38,42), (43,49), (50,50), (51,51), (52,52), (1,0), (1,0), (1,0) ); words : constant array(1..52) of word_record := ( ("ASSERTA ", rw_asserta), ("ASSERTZ ", rw_assertz), ("ATOM ", rw_atom), ("ATOMIC ", rw_atomic), ("CALL ", rw_call), ("CLAUSE ", rw_clause), ("CONSULT ", rw_consult), ("DEBUGGING", rw_debugging), ("DISPLAY ", rw_display), ("FAIL ", rw_fail), ("FLOAT ", rw_float), ("FUNCTOR ", rw_functor), ("FUZZY ", rw_fuzzy), ("GET ", rw_get), ("GET0 ", rw_get0), ("INTEGER ", rw_integer), ("IS ", rw_is), ("LISTING ", rw_listing), ("LN ", rw_ln), ("LOG ", rw_log), ("MOD ", rw_mod), ("NAME ", rw_name), ("NL ", rw_nl), ("NODEBUG ", rw_nodebug), ("NONVAR ", rw_nonvar), ("NOSPY ", rw_nospy), ("NOT ", rw_not), ("NOTRACE ", rw_notrace), ("NUMBER ", rw_number), ("OP ", rw_op), ("ORG ", rw_org), ("PARSE ", rw_parse), ("PUT ", rw_put), ("READ ", rw_read), ("REPEAT ", rw_repeat), ("RESET ", rw_reset), ("RETRACT ", rw_retract), ("SEE ", rw_see), ("SEEING ", rw_seeing), ("SEEN ", rw_seen), ("SKIP ", rw_skip), ("SPY ", rw_spy), ("TAB ", rw_tab), ("TELL ", rw_tell), ("TELLING ", rw_telling), ("THRESHOLD", rw_threshold), ("TOLD ", rw_told), ("TRACE ", rw_trace), ("TRUE ", rw_true), ("USER ", rw_user), ("VAR ", rw_var), ("WRITE ", rw_write) ); fail, found : boolean := false; which : integer; begin for i in char_pos(ident(1),1) .. char_pos(ident(1),2) loop for j in 2..length loop if ident(j) < words(i).word(j) then fail := true; elsif ident(j) > words(i).word(j) then exit; else if j = length then if j = 9 then found := true; which := i; elsif words(i).word(j+1) = ' ' then found := true; which := i; else exit; end if; end if; end if; exit when fail or found; end loop; exit when fail or found; end loop; if found then token := new token_record'(reserved_word, words(which).rw_token); end if; reserved := found; end check_reserved; begin if look_ahead_char in 'A'..'Z' then -- it's a Fuzzy Prolog variable var_flg := true; end if; loop get_char; ptr := ptr + 1; if (current_char = '_') or else (current_char in 'A'..'Z') or else (current_char in 'a'..'z') or else (current_char in '0'..'9') then ident_name(ptr) := current_char; else error(pointer,"invalid character in identifier"); skip_rest_of_token; err_flg := true; exit; end if; if current_char = '_' and (not ((look_ahead_char in 'A'..'Z') or else (look_ahead_char in 'a'..'z') or else (look_ahead_char in '0'..'9'))) then error(pointer,"underlines must separate letters or digits"); skip_rest_of_token; err_flg := true; end if; if current_char in 'a'..'z' then ident_name(ptr) := character'val(character'pos(current_char) - convert); end if; exit when valid_ending(look_ahead_char) or err_flg; exit when look_ahead_char = '.'; -- required to detect end of clause end loop; if ptr <= 9 then check_reserved(ptr, ident_name(1..ptr), reserved, token); end if; if reserved then if var_flg then error(pointer,"reserved words may not begin with capital letters"); end if; elsif var_flg then ident := new name_record'(ptr, ident_name(1..ptr)); token := new token_record'(variable, ident); else ident := new name_record'(ptr, ident_name(1..ptr)); token := new token_record'(identifier, ident); end if; end get_identifier; -- -- get_number -- This subroutine parses tokens which begin with a digit. This -- means integer and floating point numbers, either of which may -- be based (legal bases are 2-16). -- -- Syntax: number ::= value | based_value -- based_value ::= base '#' value '#' -- base ::= integer -- value ::= integer | float -- integer ::= digit { ['_'] digit } -- float ::= integer '.' integer -- separate(token.get_token) procedure get_number( token : out token_ptr ) is base : integer := 10; -- default is base 10 digit : integer; fp_decimal : float := 1.0; -- factor for digits after decimal point fp_num : float; int_num : integer := 0; -- initial value is 0 based, done, err_flg, fp : boolean := false; max_int_div_10 : constant integer := (integer'last/10); max_int_last_digit : constant integer := (integer'last - 10*max_int_div_10); -- -- digit_val -- Converts a single character to a number in the current -- base. No error checking; the character must have been -- checked by is_a_digit. -- function digit_val(char : in character; base : in integer) return integer is char_val : integer; begin char_val := character'pos(char) - character'pos('0'); if char_val > 9 then -- letter A-F or a-f if char >= 'a' then -- lower case char_val := char_val - 39; else -- upper case char_val := char_val - 7; end if; end if; return char_val; end digit_val; -- -- is_a_digit -- check a character to see if it is a valid digit in the -- current base. -- function is_a_digit(char : in character; base : in integer) return boolean is char_pos : integer; begin char_pos := character'pos(char) - character'pos('0'); if char_pos < 0 then -- below digits return false; elsif char_pos < 10 then -- it's a digit if char_pos < base then -- within the base return true; else return false; end if; elsif char_pos < 17 then return false; elsif char_pos < (base + 7) then -- a digit A-F in the base return true; end if; char_pos := char_pos - 32; -- check for lower case if (char_pos < 17) or (char_pos >= (base+7)) then return false; else return true; end if; end is_a_digit; begin -- get_number loop get_char; -- get the next numeric char if is_a_digit(current_char, base) then digit := digit_val(current_char, base); if fp then -- we're building a floating point number fp_decimal := fp_decimal / float(base); -- adjust value of digit fp_num := fp_num + float(digit) * fp_decimal; else -- an integer (at least, so far) if (int_num > max_int_div_10) or ((int_num = max_int_div_10) and (digit > max_int_last_digit)) then error(pointer,"integer too large"); err_flg := true; else int_num := int_num * base + digit; end if; end if; elsif current_char = '_' then -- ignore underline when separating digits if not is_a_digit(look_ahead_char, base) then error(pointer,"underline must separate digits"); err_flg := true; end if; elsif current_char = '#' then -- deal with based number if based then -- already working on a based number so this is the end done := true; else -- if legal, current value becomes the new base if fp or (int_num < 2) or (int_num > 16) then -- illegal error(pointer,"illegal base"); err_flg := true; else base := int_num; int_num := 0; based := true; if not is_a_digit(look_ahead_char, base) then error(pointer,"base declaration must be followed by" & " an appropriate based number"); err_flg := true; end if; end if; end if; elsif current_char = '.' then -- deal with floating point number if fp then error(pointer,"extra decimal point"); err_flg := true; elsif not is_a_digit(look_ahead_char, base) then error(pointer,"decimal point must be followed by digit"); err_flg := true; else -- current value is to left of decimal point in fp_num fp_num := float(int_num); fp := true; end if; else -- we don't know what the heck we got . . . if based then error(pointer,"illegal character in based number"); else error(pointer,"illegal character in number"); end if; err_flg := true; end if; if valid_ending(look_ahead_char) then -- at end of number done := true; end if; -- -- special case: if a floating point number has been found and a -- period is coming, it may be the end of the clause. -- For example: a(X) :- X is 7.0. -- There appears to be no reasonable way to allow this -- with integers, however. -- if fp and (look_ahead_char = '.') then done := true; end if; exit when (err_flg or done); end loop; if err_flg then -- skip rest of number; return last valid value skip_rest_of_token; end if; -- -- now define a new token record according to the type of number we've got -- if fp then token := new token_record'(float_num, fp_num); else token := new token_record'(integer_num, int_num); end if; end get_number; -- -- get_string -- Parses string literals delimited by double quotes ('"'). -- A double quote may be embedded in a string by placing two -- of them side by side (example: "abc""def" --> abc"def). -- A null string (zero length) may be specified by not -- enclosing any characters within the quotes (example: -- "" --> null string). Strings may not overlap end-of-line. -- Line length is controlled by package "io" -- -- syntax: string ::= '"' text_of_string '"' -- separate(token.get_token) procedure get_string( token : out token_ptr) is ptr : integer range 0..io.max_line_length := 0; string_value : string(1..io.max_line_length) := (others => ' '); ident : name_ptr; begin get_char; loop if (look_ahead_char = ascii.cr) or (look_ahead_char = ascii.eot) then error(pointer,"no terminating '""' for string"); get_char; exit; elsif look_ahead_char = '"' then get_char; -- throw away '"' if look_ahead_char = '"' then -- embedded '"' null; else -- done with this string exit; end if; end if; get_char; ptr := ptr + 1; string_value(ptr) := current_char; end loop; if ptr = 0 then error(pointer,"null string not allowed"); string_value(1) := '?'; ptr := 1; end if; ident := new name_record'(ptr, string_value(1..ptr)); token := new token_record'(identifier, ident); end get_string; ----------------------------- -- Fuzzy Prolog routines -- ----------------------------- -- -- get_fuzzy_backslash -- The backslash may be the start of either of the -- non-equality operators in Fuzzy Prolog. These -- are '\=' and '\==' -- separate(token.get_token) procedure get_fuzzy_backslash( token : out token_ptr) is begin if look_ahead_char /= '=' then error(pointer,"must be '\=' or '\=='"); token := new token_record'(is_a => null_token); skip_rest_of_token; else get_char; if look_ahead_char = '=' then -- '\==' token := new token_record'(is_a => not_equality); get_char; else -- '\=' token := new token_record'(is_a => not_equal); end if; end if; end get_fuzzy_backslash; -- -- get_fuzzy_colon -- The colon may only be the start of the implication -- token ':-' -- separate(token.get_token) procedure get_fuzzy_colon( token : out token_ptr) is begin get_char; if look_ahead_char /= '-' then error(pointer,"must be ':-'"); token := new token_record'(is_a => null_token); skip_rest_of_token; else token := new token_record'(is_a => implication); get_char; end if; end get_fuzzy_colon; -- -- get_fuzzy_equal -- The equal may be an equality test ('='), or may be -- the start of equality ('=='), less than or equal to -- ('=<'), or univ ('=..') -- separate(token.get_token) procedure get_fuzzy_equal( token : out token_ptr) is begin get_char; if look_ahead_char = '=' then -- equality token := new token_record'(is_a => equality); get_char; elsif look_ahead_char = '<' then -- less than or equal to token := new token_record'(is_a => less_or_equal); get_char; elsif look_ahead_char = '.' then -- may be univ? get_char; if look_ahead_char /= '.' then -- oops error(pointer,"univ must be '=..'"); skip_rest_of_token; token := new token_record'(is_a => null_token); else -- univ token := new token_record'(is_a => univ); get_char; end if; else -- just plain old '=' (equal) token := new token_record(equal); end if; end get_fuzzy_equal; -- -- get_fuzzy_underline -- The underline in Fuzzy Prolog represents an -- anonymous variable. This routine merely ensures -- that the underline is followed by a valid delimiter -- separate(token.get_token) procedure get_fuzzy_underline( token : out token_ptr) is begin if valid_ending(look_ahead_char) then token := new token_record(is_a => underline); else error(pointer,"identifiers may not begin with an underline"); token := new token_record(is_a => null_token); end if; end get_fuzzy_underline;